Sources:

The code below generates this plot:
from datetime import datetime as dt
from os.path import basename, exists
import pandas as pd
import plotly.express as px
import re
from urllib.request import urlretrieve
Helper for:
idxdef get_table(url, idx, path=None):
if path is None:
name = basename(url)
path = f'{name}.html'
if not exists(path):
print(f'Downloading {url} to {path}')
urlretrieve(url, path)
tables = pd.read_html(path)
table = tables[idx]
return table
Pull table from Wikipedia article: Motor vehicle deaths in U.S. by year
cars_url = 'https://en.wikipedia.org/wiki/Motor_vehicle_fatality_rate_in_U.S._by_year'
cars_path = 'Motor_vehicle_fatality_rate_in_U.S._by_year.html'
cars = get_table(cars_url, idx=0)
cars
| Year | Deaths | VMT– Vehicle miles traveled (billions) | Fatalities per 100 million VMT | Population | Fatalities per 100,000 population | Change in per capita fatalities from previous year | |
|---|---|---|---|---|---|---|---|
| 0 | 1899 | 26[5] | NaN | NaN | NaN | NaN | NaN |
| 1 | 1900 | 36 | NaN | NaN | 76094000.0 | 0.05 | NaN |
| 2 | 1901 | 54 | NaN | NaN | 77584000.0 | 0.07 | 47.1% |
| 3 | 1902 | 79 | NaN | NaN | 79163000.0 | 0.10 | 43.4% |
| 4 | 1903 | 117 | NaN | NaN | 80632000.0 | 0.15 | 45.4% |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 116 | 2015 | 35485 | 3095.0 | 1.15 | 321370000.0 | 11.06 | 7.6% |
| 117 | 2016[8] | 37806 | 3174.0 | 1.19 | 323121000.0 | 11.59 | 4.8% |
| 118 | 2017[8][9] | 37473 | 3213.0 | 1.16 | 326213213.0 | 11.40 | -1.6% |
| 119 | 2018[9][10] | 36560 | 3223.0 | 1.13 | 327096265.0 | 11.18 | -1.9% |
| 120 | 2019[11][12] | 36120 | NaN | 1.10 | NaN | NaN | -1.2% |
121 rows × 7 columns
for k in cars:
if cars[k].dtype == 'object':
cars[k] = cars[k].str.replace('(?: ?\[\d+\])+$', '')
cars = (
cars
.astype({'Year': int, 'Deaths': int})
.set_index('Year')
.Deaths
)
cars.loc[2020] = 38360
cars
Year
1899 26
1900 36
1901 54
1902 79
1903 117
...
2016 37806
2017 37473
2018 36560
2019 36120
2020 38360
Name: Deaths, Length: 122, dtype: int64
Pull table from Wikipedia article: United States military casualties of war
wars_url = 'https://en.wikipedia.org/wiki/United_States_military_casualties_of_war'
wars_path = 'United_States_military_casualties_of_war.html'
wars = get_table(wars_url, idx=0)
wars
| War or conflict | Date | Total U.S. deaths | Wounded | Total U.S. casualties | Missing | Sources and notes | Deaths as percentage of total population | |||
|---|---|---|---|---|---|---|---|---|---|---|
| War or conflict | Date | Combat | Other | Total | Wounded | Total U.S. casualties | Missing | Sources and notes | Deaths as percentage of total population | |
| 0 | American Revolutionary War | 1775–1783 | 8000 | 17000 | 25000 | 25000 | 50000 | NaN | NaN | NaN |
| 1 | Northwest Indian War | 1785–1796 | 1,056+ | NaN | 1,056+ | 825+ | 1,881+ | NaN | [1][2][3] | NaN |
| 2 | Quasi-War | 1798–1800 | 20 | 494[4] | 514 | 42 | 556 | NaN | [4][5] | NaN |
| 3 | First Barbary War | 1801–1805 | 35 | 39 | 74 | 64 | 138 | NaN | [6][7]: 25–27 [8][9] | NaN |
| 4 | Other actions against pirates | 1800–1900 | 36 | 158+[7]: 42 | 194+ | 100+ | 294+ | NaN | [5][10][11][b] | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 77 | War in Afghanistan | 2001-2021 | 1,833 [83] | 385[84][83] | 2218 | 20,093[83] | 22311 | NaN | [f][83][84] | NaN |
| 78 | Iraq War | 2003–2011 | 3,836* | 961 | 4497 | 32222 | 36710 | 3 | [83] | NaN |
| 79 | Intervention against the Islamic State of Iraq... | 2014–present | 16 | 60 | 76 | 81 | 157 | NaN | [83] | NaN |
| 80 | Raid on Yemen | 2017 | 1 | NaN | 1 | 3 | 4 | NaN | [85] | NaN |
| 81 | Total | 1775–2019 | 666,441+ | 673,929+ | 1,354,664+ | 1,498,240+ | 2,852,901+ | 40,031+ | NaN | NaN |
82 rows × 10 columns
~/+ characterswars.columns = [ col[1] for col in wars.columns ] # Flatten column index
for k in wars:
if wars[k].dtype == 'object':
wars[k] = wars[k].str.replace('(?: ?\[\d+\](?:\:\s?\d+(?:-\d+)?)?)+$', '')
wars[k] = wars[k].str.replace('[~+]$', '')
wars = wars.rename(columns={'War or conflict': 'War'}).set_index('War')[['Date','Total']]
wars = wars.dropna(subset=['Date', 'Total'], how='any')
wars.Total = wars.Total.str.replace(',', '')
year = dt.now().year
wars.Date = wars.Date.str.replace('present', str(year))
wars
| Date | Total | |
|---|---|---|
| War | ||
| American Revolutionary War | 1775–1783 | 25000 |
| Northwest Indian War | 1785–1796 | 1056 |
| Quasi-War | 1798–1800 | 514 |
| First Barbary War | 1801–1805 | 74 |
| Other actions against pirates | 1800–1900 | 194 |
| ... | ... | ... |
| War in Afghanistan | 2001-2021 | 2218 |
| Iraq War | 2003–2011 | 4497 |
| Intervention against the Islamic State of Iraq and Syria | 2014–2021 | 76 |
| Raid on Yemen | 2017 | 1 |
| Total | 1775–2019 | 1354664 |
77 rows × 2 columns
Parse dates into start and end years, and numeric casualty ranges into lo and hi estimates
def parse_ints(years, min_key, max_key):
matches = re.findall('\d+', years)
years = [ int(year) for year in matches ]
m = min(years)
M = max(years)
return {min_key: m, max_key: M}
wars = pd.concat(
[
wars,
wars['Date'].apply(parse_ints, min_key='start', max_key='end').apply(pd.Series),
wars['Total'].apply(parse_ints, min_key='lo', max_key='hi').apply(pd.Series),
],
axis=1,
)
wars
| Date | Total | start | end | lo | hi | |
|---|---|---|---|---|---|---|
| War | ||||||
| American Revolutionary War | 1775–1783 | 25000 | 1775 | 1783 | 25000 | 25000 |
| Northwest Indian War | 1785–1796 | 1056 | 1785 | 1796 | 1056 | 1056 |
| Quasi-War | 1798–1800 | 514 | 1798 | 1800 | 514 | 514 |
| First Barbary War | 1801–1805 | 74 | 1801 | 1805 | 74 | 74 |
| Other actions against pirates | 1800–1900 | 194 | 1800 | 1900 | 194 | 194 |
| ... | ... | ... | ... | ... | ... | ... |
| War in Afghanistan | 2001-2021 | 2218 | 2001 | 2021 | 2218 | 2218 |
| Iraq War | 2003–2011 | 4497 | 2003 | 2011 | 4497 | 4497 |
| Intervention against the Islamic State of Iraq and Syria | 2014–2021 | 76 | 2014 | 2021 | 76 | 76 |
| Raid on Yemen | 2017 | 1 | 2017 | 2017 | 1 | 1 |
| Total | 1775–2019 | 1354664 | 1775 | 2019 | 1354664 | 1354664 |
77 rows × 6 columns
Use the low-end estimates, drop final "total" row
wars = (
wars
.iloc[:-1]
.copy()
[['start', 'end', 'lo']]
.rename(columns={'lo': 'deaths'})
.sort_values('end')
)
wars
| start | end | deaths | |
|---|---|---|---|
| War | |||
| American Revolutionary War | 1775 | 1783 | 25000 |
| Northwest Indian War | 1785 | 1796 | 1056 |
| Quasi-War | 1798 | 1800 | 514 |
| First Barbary War | 1801 | 1805 | 74 |
| Chesapeake–Leopard affair | 1807 | 1807 | 3 |
| ... | ... | ... | ... |
| Iraq War | 2003 | 2011 | 4497 |
| Raid on Yemen | 2017 | 2017 | 1 |
| Colombia | 1994 | 2021 | 8 |
| War in Afghanistan | 2001 | 2021 | 2218 |
| Intervention against the Islamic State of Iraq and Syria | 2014 | 2021 | 76 |
76 rows × 3 columns
Approximate each war's deaths as being evenly spread over the years in which the war was waged:
def spread_war(r):
start, end, deaths = r['start'], r['end'], r['deaths']
years = range(start, end + 1)
num_years = len(years)
deaths_per_year = deaths / num_years
return [ dict(war_deaths=deaths_per_year, year=year) for year in years ]
war_deaths_per_year = (
wars
.apply(spread_war, axis=1)
.explode()
.apply(pd.Series)
.astype({'year': int})
.reset_index()
.groupby('year')
.sum()
)
war_deaths_per_year
| war_deaths | |
|---|---|
| year | |
| 1775 | 2777.777778 |
| 1776 | 2777.777778 |
| 1777 | 2777.777778 |
| 1778 | 2777.777778 |
| 1779 | 2777.777778 |
| ... | ... |
| 2017 | 116.404762 |
| 2018 | 115.404762 |
| 2019 | 115.404762 |
| 2020 | 115.404762 |
| 2021 | 115.404762 |
239 rows × 1 columns
deaths_per_year = pd.concat(
[
war_deaths_per_year['war_deaths'].rename('War'),
cars.rename('Cars'),
],
axis=1,
)
deaths_per_year
| War | Cars | |
|---|---|---|
| 1775 | 2777.777778 | NaN |
| 1776 | 2777.777778 | NaN |
| 1777 | 2777.777778 | NaN |
| 1778 | 2777.777778 | NaN |
| 1779 | 2777.777778 | NaN |
| ... | ... | ... |
| 2017 | 116.404762 | 37473.0 |
| 2018 | 115.404762 | 36560.0 |
| 2019 | 115.404762 | 36120.0 |
| 2020 | 115.404762 | 38360.0 |
| 2021 | 115.404762 | NaN |
245 rows × 2 columns
Fill in missing years, add cumulative sums
start, end = deaths_per_year.index.min(), deaths_per_year.index.max()
years = pd.Index(range(start, min(2020, end) + 1), name='year')
deaths = years.to_frame().merge(deaths_per_year, left_index=True, right_index=True)[['War', 'Cars']].fillna(0)
deaths['War (Total)'] = deaths['War'].cumsum()
deaths['Cars (Total)'] = deaths['Cars'].cumsum()
deaths
| War | Cars | War (Total) | Cars (Total) | |
|---|---|---|---|---|
| 1775 | 2777.777778 | 0.0 | 2.777778e+03 | 0.0 |
| 1776 | 2777.777778 | 0.0 | 5.555556e+03 | 0.0 |
| 1777 | 2777.777778 | 0.0 | 8.333333e+03 | 0.0 |
| 1778 | 2777.777778 | 0.0 | 1.111111e+04 | 0.0 |
| 1779 | 2777.777778 | 0.0 | 1.388889e+04 | 0.0 |
| ... | ... | ... | ... | ... |
| 2016 | 115.404762 | 37806.0 | 1.345876e+06 | 3720188.0 |
| 2017 | 116.404762 | 37473.0 | 1.345992e+06 | 3757661.0 |
| 2018 | 115.404762 | 36560.0 | 1.346108e+06 | 3794221.0 |
| 2019 | 115.404762 | 36120.0 | 1.346223e+06 | 3830341.0 |
| 2020 | 115.404762 | 38360.0 | 1.346339e+06 | 3868701.0 |
244 rows × 4 columns
fig = px.line(
deaths[['War (Total)', 'Cars (Total)']],
labels={
'variable': '',
'value': 'Deaths',
'index': 'Year',
}
)
fig.update_layout(
title='U.S. deaths over time: war (combat + non-combat) vs. cars (crashes only)', title_x=0.5
)
fig
fig.write_image('car_war_deaths.png')
